The Series Data Structure


In [2]:
import pandas as pd
pd.Series?

In [3]:
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)


Out[3]:
0    Tiger
1     Bear
2    Moose
dtype: object

In [4]:
numbers = [1,2,3,4]
pd.Series(numbers)


Out[4]:
0    1
1    2
2    3
3    4
dtype: int64

In [5]:
animals = ['Tiger', 'Bear', None]
pd.Series(animals)


Out[5]:
0    Tiger
1     Bear
2     None
dtype: object

In [6]:
numbers = [1, 2, None]
pd.Series(numbers)


Out[6]:
0    1.0
1    2.0
2    NaN
dtype: float64

In [7]:
import numpy as np

In [8]:
np.nan == None


Out[8]:
False

In [9]:
np.isnan(np.nan)


Out[9]:
True

In [10]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s


Out[10]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [11]:
s.index


Out[11]:
Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')

In [12]:
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s


Out[12]:
India      Tiger
America     Bear
Canada     Moose
dtype: object

In [13]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s


Out[13]:
Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object

Querying a Series


In [21]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s


Out[21]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [22]:
s.iloc[3]


Out[22]:
'South Korea'

In [23]:
s.loc['Golf']


Out[23]:
'Scotland'

In [24]:
s[3]


Out[24]:
'South Korea'

In [25]:
s['Golf']


Out[25]:
'Scotland'

In [17]:
sports = {99: 'Bhutan',
          100: 'Scotland',
          101: 'Japan',
          102: 'South Korea'}
s = pd.Series(sports)

In [26]:
#s[0] #generates error

In [27]:
import numpy as np

In [28]:
s = pd.Series([100.00, 120.00, 101.00, 3.00])
s


Out[28]:
0    100.0
1    120.0
2    101.0
3      3.0
dtype: float64

In [29]:
total = 0
for item in s:
    total+=item
print(total)


324.0

In [30]:
total = np.sum(s)
print(total)


324.0

Let's check efficienccy


In [32]:
s = pd.Series(np.random.randint(0,1000, 1000))
s.head()


Out[32]:
0    852
1    931
2    744
3    845
4    346
dtype: int32

In [33]:
len(s)


Out[33]:
1000

In [41]:
%%timeit 
summary = 0
for item in s:
    summary+=item


10000 loops, best of 3: 127 µs per loop

In [42]:
%%timeit
summary = np.sum(s)


10000 loops, best of 3: 102 µs per loop

In [43]:
s+=2 #adds two to each item in s using broadcasting
s.head()


Out[43]:
0    854
1    933
2    746
3    847
4    348
dtype: int32

In [44]:
for label, value in s.iteritems():
    s.set_value(label, value+2)
s.head()


Out[44]:
0    856
1    935
2    748
3    849
4    350
dtype: int32

In [45]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
    s.loc[label]= value+2


10 loops, best of 3: 969 ms per loop

In [46]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
s+=2


10 loops, best of 3: 325 µs per loop

In [47]:
s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s


Out[47]:
0             1
1             2
2             3
Animal    Bears
dtype: object

In [48]:
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'], 
                                   index=['Cricket',
                                          'Cricket',
                                          'Cricket',
                                          'Cricket'])
all_countries = original_sports.append(cricket_loving_countries)

In [49]:
original_sports #is not changed by append


Out[49]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [50]:
cricket_loving_countries


Out[50]:
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [51]:
all_countries


Out[51]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object

In [53]:
all_countries.loc['Cricket'] #interesting


Out[53]:
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

DataFrame


In [54]:
import pandas as pd
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
df.head()


Out[54]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod

In [58]:
df.loc['Store 2']['Cost']


Out[58]:
5.0

In [59]:
type(df.loc['Store 2'])


Out[59]:
pandas.core.series.Series

In [60]:
df.loc['Store 1', 'Cost']


Out[60]:
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64

In [61]:
df.T


Out[61]:
Store 1 Store 1 Store 2
Cost 22.5 2.5 5
Item Purchased Dog Food Kitty Litter Bird Seed
Name Chris Kevyn Vinod

In [62]:
df.T.loc['Cost']


Out[62]:
Store 1    22.5
Store 1     2.5
Store 2       5
Name: Cost, dtype: object

In [63]:
df['Cost']


Out[63]:
Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64

In [67]:
df.loc['Store 1']['Cost']


Out[67]:
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64

In [68]:
df.loc[:, ['Name', 'Cost']]


Out[68]:
Name Cost
Store 1 Chris 22.5
Store 1 Kevyn 2.5
Store 2 Vinod 5.0

In [69]:
df.drop('Store 1')


Out[69]:
Cost Item Purchased Name
Store 2 5.0 Bird Seed Vinod

In [70]:
df


Out[70]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod

In [71]:
copy_df = df.copy()
copy_df = copy_df.drop('Store 1')
copy_df


Out[71]:
Cost Item Purchased Name
Store 2 5.0 Bird Seed Vinod

In [72]:
copy_df.drop?

In [73]:
del copy_df['Name']

In [74]:
df['Location'] = None
df.set_index()


Out[74]:
Cost Item Purchased Name Location
Store 1 22.5 Dog Food Chris None
Store 1 2.5 Kitty Litter Kevyn None
Store 2 5.0 Bird Seed Vinod None

Dataframe Indexing and Loading


In [75]:
costs = df['Cost']

In [76]:
costs+=2

In [77]:
df


Out[77]:
Cost Item Purchased Name Location
Store 1 24.5 Dog Food Chris None
Store 1 4.5 Kitty Litter Kevyn None
Store 2 7.0 Bird Seed Vinod None

In [78]:
!cat olympics.csv


'cat' is not recognized as an internal or external command,
operable program or batch file.

In [83]:
df['Cost'] = df['Cost']*0.8

In [85]:
df['Cost'] *= 0.8

In [86]:
pwd


Out[86]:
'C:\\Users\\Aroogz\\Desktop\\e-Class\\Coursera\\Introduction to data science in python\\Week 1'

In [88]:
df = pd.read_csv('../Week 2/olympics.csv')
df.head()


Out[88]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 NaN № Summer 01 ! 02 ! 03 ! Total № Winter 01 ! 02 ! 03 ! Total № Games 01 ! 02 ! 03 ! Combined total
1 Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
2 Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
3 Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
4 Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12

In [90]:
df = pd.read_csv('../Week 2/olympics.csv', skiprows=1, index_col=0)
df.head()


Out[90]:
№ Summer 01 ! 02 ! 03 ! Total № Winter 01 !.1 02 !.1 03 !.1 Total.1 № Games 01 !.2 02 !.2 03 !.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

In [91]:
df.columns


Out[91]:
Index(['№ Summer', '01 !', '02 !', '03 !', 'Total', '№ Winter', '01 !.1',
       '02 !.1', '03 !.1', 'Total.1', '№ Games', '01 !.2', '02 !.2', '03 !.2',
       'Combined total'],
      dtype='object')

In [92]:
for col in df.columns:
    if col[:2] == '01':
        df.rename(columns={col: 'Gold' + col[4:]}, inplace=True)
    if col[:2] == '02':
        df.rename(columns={col: 'Silver' + col[4:]}, inplace=True)
    if col[:2] == '03':
        df.rename(columns={col: 'Bronze' + col[4:]}, inplace=True)
    if col[:2] == '№':
        df.rename(columns={col: '#' + col[1:]}, inplace=True)
        
df.head()


Out[92]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

Querying a DataFrame


In [94]:
#df['Gold'] > 0

In [96]:
only_gold = df.where(df['Gold'] > 0)
only_gold.head()


Out[96]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Algeria (ALG) 12.0 5.0 2.0 8.0 15.0 3.0 0.0 0.0 0.0 0.0 15.0 5.0 2.0 8.0 15.0
Argentina (ARG) 23.0 18.0 24.0 28.0 70.0 18.0 0.0 0.0 0.0 0.0 41.0 18.0 24.0 28.0 70.0
Armenia (ARM) 5.0 1.0 2.0 9.0 12.0 6.0 0.0 0.0 0.0 0.0 11.0 1.0 2.0 9.0 12.0
Australasia (ANZ) [ANZ] 2.0 3.0 4.0 5.0 12.0 0.0 0.0 0.0 0.0 0.0 2.0 3.0 4.0 5.0 12.0

In [98]:
only_gold = only_gold.dropna(axis=0)
only_gold.head()


Out[98]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Algeria (ALG) 12.0 5.0 2.0 8.0 15.0 3.0 0.0 0.0 0.0 0.0 15.0 5.0 2.0 8.0 15.0
Argentina (ARG) 23.0 18.0 24.0 28.0 70.0 18.0 0.0 0.0 0.0 0.0 41.0 18.0 24.0 28.0 70.0
Armenia (ARM) 5.0 1.0 2.0 9.0 12.0 6.0 0.0 0.0 0.0 0.0 11.0 1.0 2.0 9.0 12.0
Australasia (ANZ) [ANZ] 2.0 3.0 4.0 5.0 12.0 0.0 0.0 0.0 0.0 0.0 2.0 3.0 4.0 5.0 12.0
Australia (AUS) [AUS] [Z] 25.0 139.0 152.0 177.0 468.0 18.0 5.0 3.0 4.0 12.0 43.0 144.0 155.0 181.0 480.0

In [100]:
# alternatively more concise
only_gold = df[df['Gold'] > 0]
only_gold.head()


Out[100]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12
Australia (AUS) [AUS] [Z] 25 139 152 177 468 18 5 3 4 12 43 144 155 181 480

In [101]:
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])


Out[101]:
101

In [102]:
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]


Out[102]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Liechtenstein (LIE) 16 0 0 0 0 18 2 2 5 9 34 2 2 5 9

Indexing Dataframes


In [103]:
df.head()


Out[103]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

In [108]:
df['Country'] = df.index

In [109]:
df = df.set_index('Gold')
df.head()


Out[109]:
№ Summer Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total Country
Gold
0 13 0 2 2 0 0 0 0 0 13 0 0 2 2 Afghanistan (AFG)
5 12 2 8 15 3 0 0 0 0 15 5 2 8 15 Algeria (ALG)
18 23 24 28 70 18 0 0 0 0 41 18 24 28 70 Argentina (ARG)
1 5 2 9 12 6 0 0 0 0 11 1 2 9 12 Armenia (ARM)
3 2 4 5 12 0 0 0 0 0 2 3 4 5 12 Australasia (ANZ) [ANZ]

In [111]:
df = df.reset_index()

In [176]:
df = pd.read_csv('../Week 2/census.csv', encoding='iso-8859-1')

In [177]:
df.head()


Out[177]:
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
0 40 3 6 1 0 Alabama Alabama 4779736 4780127 4785161 ... 0.002295 -0.193196 0.381066 0.582002 -0.467369 1.030015 0.826644 1.383282 1.724718 0.712594
1 50 3 6 1 1 Alabama Autauga County 54571 54571 54660 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183193 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
3 50 3 6 1 5 Alabama Barbour County 27457 27457 27341 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
4 50 3 6 1 7 Alabama Bibb County 22915 22919 22861 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861

5 rows × 100 columns


In [178]:
df['SUMLEV'].unique()


Out[178]:
array([40, 50], dtype=int64)

In [179]:
df = df[df['SUMLEV'] == 50]
df.head()


Out[179]:
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
1 50 3 6 1 1 Alabama Autauga County 54571 54571 54660 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183193 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
3 50 3 6 1 5 Alabama Barbour County 27457 27457 27341 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
4 50 3 6 1 7 Alabama Bibb County 22915 22919 22861 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861
5 50 3 6 1 9 Alabama Blount County 57322 57322 57373 ... 1.807375 -1.177622 -1.748766 -2.062535 -1.369970 1.859511 -0.848580 -1.402476 -1.577232 -0.884411

5 rows × 100 columns


In [180]:
columns_to_keep = ['STNAME',
                   'CTYNAME',
                   'BIRTHS2010',
                   'BIRTHS2011',
                   'BIRTHS2012',
                   'BIRTHS2013',
                   'BIRTHS2014',
                   'BIRTHS2015',
                   'POPESTIMATE2010',
                   'POPESTIMATE2011',
                   'POPESTIMATE2012',
                   'POPESTIMATE2013',
                   'POPESTIMATE2014',
                   'POPESTIMATE2015']
df = df[columns_to_keep]
df.head()


Out[180]:
STNAME CTYNAME BIRTHS2010 BIRTHS2011 BIRTHS2012 BIRTHS2013 BIRTHS2014 BIRTHS2015 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
1 Alabama Autauga County 151 636 615 574 623 600 54660 55253 55175 55038 55290 55347
2 Alabama Baldwin County 517 2187 2092 2160 2186 2240 183193 186659 190396 195126 199713 203709
3 Alabama Barbour County 70 335 300 283 260 269 27341 27226 27159 26973 26815 26489
4 Alabama Bibb County 44 266 245 259 247 253 22861 22733 22642 22512 22549 22583
5 Alabama Blount County 183 744 710 646 618 603 57373 57711 57776 57734 57658 57673

In [181]:
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()


Out[181]:
BIRTHS2010 BIRTHS2011 BIRTHS2012 BIRTHS2013 BIRTHS2014 BIRTHS2015 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
STNAME CTYNAME
Alabama Autauga County 151 636 615 574 623 600 54660 55253 55175 55038 55290 55347
Baldwin County 517 2187 2092 2160 2186 2240 183193 186659 190396 195126 199713 203709
Barbour County 70 335 300 283 260 269 27341 27226 27159 26973 26815 26489
Bibb County 44 266 245 259 247 253 22861 22733 22642 22512 22549 22583
Blount County 183 744 710 646 618 603 57373 57711 57776 57734 57658 57673

In [182]:
df.loc['Michigan', 'Washtenaw County']


Out[182]:
BIRTHS2010            977
BIRTHS2011           3826
BIRTHS2012           3780
BIRTHS2013           3662
BIRTHS2014           3683
BIRTHS2015           3709
POPESTIMATE2010    345563
POPESTIMATE2011    349048
POPESTIMATE2012    351213
POPESTIMATE2013    354289
POPESTIMATE2014    357029
POPESTIMATE2015    358880
Name: (Michigan, Washtenaw County), dtype: int64

In [183]:
df.loc[[('Michigan', 'Washtenaw County'),
       ('Michigan', 'Wayne County')]]


Out[183]:
BIRTHS2010 BIRTHS2011 BIRTHS2012 BIRTHS2013 BIRTHS2014 BIRTHS2015 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
STNAME CTYNAME
Michigan Washtenaw County 977 3826 3780 3662 3683 3709 345563 349048 351213 354289 357029 358880
Wayne County 5918 23819 23270 23377 23607 23586 1815199 1801273 1792514 1775713 1766008 1759335

Missing Values


In [169]:
df = pd.read_csv('../Week 2/log.csv')
df


Out[169]:
time user video playback position paused volume
0 1469974424 cheryl intro.html 5 False 10.0
1 1469974454 cheryl intro.html 6 NaN NaN
2 1469974544 cheryl intro.html 9 NaN NaN
3 1469974574 cheryl intro.html 10 NaN NaN
4 1469977514 bob intro.html 1 NaN NaN
5 1469977544 bob intro.html 1 NaN NaN
6 1469977574 bob intro.html 1 NaN NaN
7 1469977604 bob intro.html 1 NaN NaN
8 1469974604 cheryl intro.html 11 NaN NaN
9 1469974694 cheryl intro.html 14 NaN NaN
10 1469974724 cheryl intro.html 15 NaN NaN
11 1469974454 sue advanced.html 24 NaN NaN
12 1469974524 sue advanced.html 25 NaN NaN
13 1469974424 sue advanced.html 23 False 10.0
14 1469974554 sue advanced.html 26 NaN NaN
15 1469974624 sue advanced.html 27 NaN NaN
16 1469974654 sue advanced.html 28 NaN 5.0
17 1469974724 sue advanced.html 29 NaN NaN
18 1469974484 cheryl intro.html 7 NaN NaN
19 1469974514 cheryl intro.html 8 NaN NaN
20 1469974754 sue advanced.html 30 NaN NaN
21 1469974824 sue advanced.html 31 NaN NaN
22 1469974854 sue advanced.html 32 NaN NaN
23 1469974924 sue advanced.html 33 NaN NaN
24 1469977424 bob intro.html 1 True 10.0
25 1469977454 bob intro.html 1 NaN NaN
26 1469977484 bob intro.html 1 NaN NaN
27 1469977634 bob intro.html 1 NaN NaN
28 1469977664 bob intro.html 1 NaN NaN
29 1469974634 cheryl intro.html 12 NaN NaN
30 1469974664 cheryl intro.html 13 NaN NaN
31 1469977694 bob intro.html 1 NaN NaN
32 1469977724 bob intro.html 1 NaN NaN

In [170]:
df = df.set_index('time')
df = df.sort_index()
df


Out[170]:
user video playback position paused volume
time
1469974424 cheryl intro.html 5 False 10.0
1469974424 sue advanced.html 23 False 10.0
1469974454 cheryl intro.html 6 NaN NaN
1469974454 sue advanced.html 24 NaN NaN
1469974484 cheryl intro.html 7 NaN NaN
1469974514 cheryl intro.html 8 NaN NaN
1469974524 sue advanced.html 25 NaN NaN
1469974544 cheryl intro.html 9 NaN NaN
1469974554 sue advanced.html 26 NaN NaN
1469974574 cheryl intro.html 10 NaN NaN
1469974604 cheryl intro.html 11 NaN NaN
1469974624 sue advanced.html 27 NaN NaN
1469974634 cheryl intro.html 12 NaN NaN
1469974654 sue advanced.html 28 NaN 5.0
1469974664 cheryl intro.html 13 NaN NaN
1469974694 cheryl intro.html 14 NaN NaN
1469974724 cheryl intro.html 15 NaN NaN
1469974724 sue advanced.html 29 NaN NaN
1469974754 sue advanced.html 30 NaN NaN
1469974824 sue advanced.html 31 NaN NaN
1469974854 sue advanced.html 32 NaN NaN
1469974924 sue advanced.html 33 NaN NaN
1469977424 bob intro.html 1 True 10.0
1469977454 bob intro.html 1 NaN NaN
1469977484 bob intro.html 1 NaN NaN
1469977514 bob intro.html 1 NaN NaN
1469977544 bob intro.html 1 NaN NaN
1469977574 bob intro.html 1 NaN NaN
1469977604 bob intro.html 1 NaN NaN
1469977634 bob intro.html 1 NaN NaN
1469977664 bob intro.html 1 NaN NaN
1469977694 bob intro.html 1 NaN NaN
1469977724 bob intro.html 1 NaN NaN

In [171]:
df = df.reset_index()
df = df.set_index(['time', 'user'])
df


Out[171]:
video playback position paused volume
time user
1469974424 cheryl intro.html 5 False 10.0
sue advanced.html 23 False 10.0
1469974454 cheryl intro.html 6 NaN NaN
sue advanced.html 24 NaN NaN
1469974484 cheryl intro.html 7 NaN NaN
1469974514 cheryl intro.html 8 NaN NaN
1469974524 sue advanced.html 25 NaN NaN
1469974544 cheryl intro.html 9 NaN NaN
1469974554 sue advanced.html 26 NaN NaN
1469974574 cheryl intro.html 10 NaN NaN
1469974604 cheryl intro.html 11 NaN NaN
1469974624 sue advanced.html 27 NaN NaN
1469974634 cheryl intro.html 12 NaN NaN
1469974654 sue advanced.html 28 NaN 5.0
1469974664 cheryl intro.html 13 NaN NaN
1469974694 cheryl intro.html 14 NaN NaN
1469974724 cheryl intro.html 15 NaN NaN
sue advanced.html 29 NaN NaN
1469974754 sue advanced.html 30 NaN NaN
1469974824 sue advanced.html 31 NaN NaN
1469974854 sue advanced.html 32 NaN NaN
1469974924 sue advanced.html 33 NaN NaN
1469977424 bob intro.html 1 True 10.0
1469977454 bob intro.html 1 NaN NaN
1469977484 bob intro.html 1 NaN NaN
1469977514 bob intro.html 1 NaN NaN
1469977544 bob intro.html 1 NaN NaN
1469977574 bob intro.html 1 NaN NaN
1469977604 bob intro.html 1 NaN NaN
1469977634 bob intro.html 1 NaN NaN
1469977664 bob intro.html 1 NaN NaN
1469977694 bob intro.html 1 NaN NaN
1469977724 bob intro.html 1 NaN NaN

In [172]:
df = df.fillna(method= 'ffill')
df.head()


Out[172]:
video playback position paused volume
time user
1469974424 cheryl intro.html 5 False 10.0
sue advanced.html 23 False 10.0
1469974454 cheryl intro.html 6 False 10.0
sue advanced.html 24 False 10.0
1469974484 cheryl intro.html 7 False 10.0

In [ ]: